In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from pandas_profiling import ProfileReport
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [2]:
movies_df=pd.read_csv("movies_metadata.csv")
ratings_df=pd.read_csv("ratings.csv")
C:\Users\jpras\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning:

Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.

In [3]:
movies_df.head()
Out[3]:
adult belongs_to_collection budget genres homepage id imdb_id original_language original_title overview ... release_date revenue runtime spoken_languages status tagline title video vote_average vote_count
0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... http://toystory.disney.com/toy-story 862 tt0114709 en Toy Story Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] Released NaN Toy Story False 7.7 5415.0
1 False NaN 65000000 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... NaN 8844 tt0113497 en Jumanji When siblings Judy and Peter discover an encha... ... 1995-12-15 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Roll the dice and unleash the excitement! Jumanji False 6.9 2413.0
2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... NaN 15602 tt0113228 en Grumpier Old Men A family wedding reignites the ancient feud be... ... 1995-12-22 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] Released Still Yelling. Still Fighting. Still Ready for... Grumpier Old Men False 6.5 92.0
3 False NaN 16000000 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... NaN 31357 tt0114885 en Waiting to Exhale Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] Released Friends are the people who let you be yourself... Waiting to Exhale False 6.1 34.0
4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 [{'id': 35, 'name': 'Comedy'}] NaN 11862 tt0113041 en Father of the Bride Part II Just when George Banks has recovered from his ... ... 1995-02-10 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] Released Just When His World Is Back To Normal... He's ... Father of the Bride Part II False 5.7 173.0

5 rows × 24 columns

In [4]:
ratings_df.head()
Out[4]:
userId movieId rating timestamp
0 1 110 1.0 1425941529
1 1 147 4.5 1425942435
2 1 858 5.0 1425941523
3 1 1221 5.0 1425941546
4 1 1246 5.0 1425941556
In [5]:
movies_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  revenue                45460 non-null  float64
 16  runtime                45203 non-null  float64
 17  spoken_languages       45460 non-null  object 
 18  status                 45379 non-null  object 
 19  tagline                20412 non-null  object 
 20  title                  45460 non-null  object 
 21  video                  45460 non-null  object 
 22  vote_average           45460 non-null  float64
 23  vote_count             45460 non-null  float64
dtypes: float64(4), object(20)
memory usage: 8.3+ MB
In [6]:
ratings_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB
In [7]:
movies_df.describe()
Out[7]:
revenue runtime vote_average vote_count
count 4.546000e+04 45203.000000 45460.000000 45460.000000
mean 1.120935e+07 94.128199 5.618207 109.897338
std 6.433225e+07 38.407810 1.924216 491.310374
min 0.000000e+00 0.000000 0.000000 0.000000
25% 0.000000e+00 85.000000 5.000000 3.000000
50% 0.000000e+00 95.000000 6.000000 10.000000
75% 0.000000e+00 107.000000 6.800000 34.000000
max 2.787965e+09 1256.000000 10.000000 14075.000000
In [8]:
ratings_df.rating.describe()
Out[8]:
count    2.602429e+07
mean     3.528090e+00
std      1.065443e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64
In [9]:
ratings_df.isna().sum()
Out[9]:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
In [10]:
movies_df.isna().sum()
Out[10]:
adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64
In [11]:
ratings_df.isna().sum()
Out[11]:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
In [12]:
#movies_df.drop(['belongs_to_collection', 'homepage', 'tagline', 'poster_path', 'overview', 'spoken_languages'], inplace=True, axis=1)
In [13]:
movies_df.isna().sum()
Out[13]:
adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64
In [14]:
movies_df.columns
Out[14]:
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
In [15]:
movies_df['revenue'] = movies_df['revenue'].replace(0,np.nan)
In [16]:
movies_df['budget'] = pd.to_numeric(movies_df['budget'], errors='coerce')
movies_df['budget'] = movies_df['budget'].replace(0, np.nan)
movies_df[movies_df['budget'].isnull()].shape
Out[16]:
(36576, 24)
In [17]:
movies_df['return'] = movies_df['revenue']/movies_df['budget']
In [18]:
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['year'] = pd.DatetimeIndex(movies_df['release_date']).year
movies_df['month'] = movies_df['release_date'].dt.month_name()
In [19]:
x = movies_df.apply(lambda x: pd.Series(x['production_countries']),axis=1).stack().reset_index(level=1, drop=True)
x.name = 'countries' #if we dont do this than we get error while joining as column wont have any name
In [22]:
country_data = movies_df.drop('production_countries', axis=1).join(x)
country_data = pd.DataFrame(country_data['countries'].value_counts())
country_data['country'] = country_data.index
country_data.columns = ['num_movies', 'country']
country_data = country_data.reset_index().drop('index', axis=1)
In [23]:
country_data.head(10)
Out[23]:
num_movies country
0 17851 [{'iso_3166_1': 'US', 'name': 'United States o...
1 6282 []
2 2238 [{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]
3 1654 [{'iso_3166_1': 'FR', 'name': 'France'}]
4 1356 [{'iso_3166_1': 'JP', 'name': 'Japan'}]
5 1030 [{'iso_3166_1': 'IT', 'name': 'Italy'}]
6 840 [{'iso_3166_1': 'CA', 'name': 'Canada'}]
7 749 [{'iso_3166_1': 'DE', 'name': 'Germany'}]
8 735 [{'iso_3166_1': 'RU', 'name': 'Russia'}]
9 735 [{'iso_3166_1': 'IN', 'name': 'India'}]
In [24]:
country_dada = country_data[country_data['country'] != 'UK']
In [28]:
data = dict(type = 'choropleth',
           locations = country_data['country'],
           colorscale = 'Portland',
           locationmode = 'country names',
           z = country_data['num_movies'],
           text = country_data['country'],
           marker = dict(line = dict(color = 'rgb(255,255,255)',width=1)),
           colorbar = {'title' : 'No of movies per country'})
In [29]:
layout = dict(title = 'Countries where movies is directed',
             geo = dict(showframe= False,
                       projection = {'type' : 'mercator'}))
In [30]:
choromaps = go.Figure(data=[data], layout=layout)
iplot(choromaps)
In [32]:
fran_data = movies_df[movies_df['belongs_to_collection'].notnull()]
In [33]:
fran_data['belongs_to_collection'] = fran_data['belongs_to_collection'].apply(ast.literal_eval).apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)
<ipython-input-33-402f2c5d7dca>:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [34]:
fran_data = fran_data[fran_data['belongs_to_collection'].notnull()]
In [35]:
pivotfran_data = pd.pivot_table(fran_data,
                              index = 'belongs_to_collection',
                              values='revenue',
                              aggfunc={'revenue':['count','sum']}).reset_index()
pivotfran_data.sort_values
Out[35]:
<bound method DataFrame.sort_values of                      belongs_to_collection  count          sum
0                ... Has Fallen Collection      2  366780087.0
1                   00 Schneider Filmreihe      0          0.0
2                         08/15 Collection      0          0.0
3                     100 Girls Collection      0          0.0
4     101 Dalmatians (Animated) Collection      1  215880014.0
...                                    ...    ...          ...
1690                     Сказки Чуковского      0          0.0
1691             Чебурашка и крокодил Гена      0          0.0
1692       Что Творят мужчины! (Коллекция)      0          0.0
1693                           男はつらいよ シリーズ      0          0.0
1694                                식객 시리즈      0          0.0

[1695 rows x 3 columns]>
In [36]:
pivotfran_data.sort_values('sum', ascending=False).head(10)
Out[36]:
belongs_to_collection count sum
552 Harry Potter Collection 8 7.707367e+09
1160 Star Wars Collection 8 7.434495e+09
646 James Bond Collection 26 7.106970e+09
1317 The Fast and the Furious Collection 8 5.125099e+09
968 Pirates of the Caribbean Collection 5 4.521577e+09
1550 Transformers Collection 5 4.366101e+09
325 Despicable Me Collection 4 3.691070e+09
1491 The Twilight Collection 5 3.342107e+09
610 Ice Age Collection 5 3.216709e+09
666 Jurassic Park Collection 4 3.031484e+09
In [42]:
pivotfran_data.sort_values('count', ascending=False).head(10)
Out[42]:
belongs_to_collection count sum
646 James Bond Collection 26 7.106970e+09
473 Friday the 13th Collection 12 4.648985e+08
976 Pokémon Collection 11 6.983008e+08
552 Harry Potter Collection 8 7.707367e+09
540 Halloween Collection 8 2.471681e+08
29 A Nightmare on Elm Street Collection 8 3.635916e+08
1317 The Fast and the Furious Collection 8 5.125099e+09
1432 The Pink Panther (Original) Collection 8 1.644782e+08
1160 Star Wars Collection 8 7.434495e+09
977 Police Academy Collection 7 3.046432e+08
In [41]:
p = pivotfran_data.sort_values('count', ascending=False)
In [39]:
movies_df.boxplot()
Out[39]:
<AxesSubplot:>
In [44]:
fig1 = px.scatter(pivotfran_data.head(25), x="count", y="sum",size="sum", color="belongs_to_collection",
                 hover_name="count", log_x=True, size_max=60)
fig1.show()
In [45]:
#lets convert to string type
movies_df['title']=movies_df['title'].astype('str')
#movies_df['overview']=movies_df['overview'].astype('str')

#lets join titles continuousy seperated by space
title_data=' '.join(movies_df['title'])
#overview_data=' '.join(movies_df['overview'])
In [46]:
languages = movies_df["original_language"].value_counts()
langues_df = pd.DataFrame({'languages':languages.index, 'frequency':languages.values}).head(10)

fig = px.bar(langues_df, x="frequency", y="languages",color='languages', orientation='h',
             hover_data=["languages", "frequency"],
             height=1000,
             title='Language which has more Movies')
fig.show()
In [47]:
top_movies = movies_df[["title","vote_count"]]
top_movies = top_movies.sort_values(by="vote_count",ascending=False)
fig = px.bar(data_frame=top_movies[:20],x="title",y="vote_count",color="title",title="Most Voted Movies")
fig.show()
In [48]:
s = movies_df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = movies_df.drop('genres', axis=1).join(s)
In [49]:
genre_counts = gen_md.genre.value_counts()
genre_df = pd.DataFrame({'genre':genre_counts.index,"count":genre_counts.values})
fig = px.bar(data_frame=genre_df[:10],x="genre",y="count",color="genre")
fig.show()
In [50]:
movies_df['adult']
Out[50]:
0        False
1        False
2        False
3        False
4        False
         ...  
45461    False
45462    False
45463    False
45464    False
45465    False
Name: adult, Length: 45466, dtype: object
In [52]:
adults_count = movies_df['adult'].value_counts()
adults_df = pd.DataFrame({"adults":adults_count.index,"count":adults_count.values})
fig = px.bar(data_frame=adults_df[:2],x="adults",y="count",color="adults")
fig.show()
In [53]:
budget_anlys = movies_df.sort_values(by="budget",ascending=False)
budget_anlys[["budget",'title']].head(10)
Out[53]:
budget title
17124 380000000.0 Pirates of the Caribbean: On Stranger Tides
11827 300000000.0 Pirates of the Caribbean: At World's End
26558 280000000.0 Avengers: Age of Ultron
11067 270000000.0 Superman Returns
44842 260000000.0 Transformers: The Last Knight
16130 260000000.0 Tangled
18685 260000000.0 John Carter
11780 258000000.0 Spider-Man 3
21175 255000000.0 The Lone Ranger
22059 250000000.0 The Hobbit: The Desolation of Smaug
In [ ]: